from IPython.display import Image
# Show the UCI ML repository logo, fetched from the archive's website.
Image(url='http://archive.ics.uci.edu/ml/assets/logo.gif')
import numpy as np
import pandas as pd
import seaborn as sns
from patsy import dmatrices, dmatrix
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.linear_model import Ridge
import plotly
import plotly.tools as tls
from plotly.offline import plot, iplot
plotly.offline.init_notebook_mode()
import cufflinks as cf
cf.go_offline()
%matplotlib inline
# Red-wine quality data from the UCI repository; fields are ';'-separated.
#DATA_DIR = '../data/'
df = pd.read_csv("winequality-red.csv", sep=';')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): fixed acidity 1599 non-null float64 volatile acidity 1599 non-null float64 citric acid 1599 non-null float64 residual sugar 1599 non-null float64 chlorides 1599 non-null float64 free sulfur dioxide 1599 non-null float64 total sulfur dioxide 1599 non-null float64 density 1599 non-null float64 pH 1599 non-null float64 sulphates 1599 non-null float64 alcohol 1599 non-null float64 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB
The data is quite clean.
# Shorten the column names for easier access.
# NOTE(review): 'totals02' is spelled with a zero, not the letter O — the
# rest of the notebook references it that way, so it is kept as-is.
df.columns = [
    'fixedacid', 'volacid', 'citacid', 'ressugar', 'chlorides',
    'freeso2', 'totals02', 'density', 'ph', 'sulphates',
    'alcohol', 'quality',
]
df.describe()
| fixedacid | volacid | citacid | ressugar | chlorides | freeso2 | totals02 | density | ph | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 |
| mean | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087467 | 15.874922 | 46.467792 | 0.996747 | 3.311113 | 0.658149 | 10.422983 | 5.636023 |
| std | 1.741096 | 0.179060 | 0.194801 | 1.409928 | 0.047065 | 10.460157 | 32.895324 | 0.001887 | 0.154386 | 0.169507 | 1.065668 | 0.807569 |
| min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
| 25% | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 |
| 50% | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
| 75% | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997835 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
| max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
# One box plot per feature (cufflinks/plotly) to spot skew and outliers.
df.iplot(kind='box', subplots = True, shape = (3,4), dimensions = (980,1200), legend = False, title = 'Univariate Box Plots')
The box plots of ressugar and chlorides indicate the presence of outliers, which should be explored.
# Flag rows with extreme chlorides or residual sugar (per the box plots above).
# NOTE(review): 'outliers' does not appear to be used again in this notebook.
outliers = df[(df.chlorides > 0.3) | (df.ressugar> 7)] # to be used later
# Per-feature histograms to inspect each distribution's shape.
df.iplot(kind='hist', subplots = True, shape = (4,3), subplot_titles=True, dimensions = (980,1000), legend = False, title = 'Univariate Histograms')
# Count wines with exactly zero citric acid.
len(df[df.citacid == 0])
132
# Bivariate view: quality plotted against each feature.
cols = df.columns[:-1]
df.iplot(
    kind = 'scatter',
    mode = 'markers',
    symbol = 'x',
    size = 4,
    x = 'quality',
    bestfit = False,
    subplots= True,
    subplot_titles = True,
    title = 'Scatter Plot of Quality vs Features',
    dimensions = (980,1600)
)
Due to the semi-categorical nature of the dependent variable, the bivariate relationships are harder to interpret. However, the following relationships with the dependent variable are observed.
#Prepare X and y
cols = df.columns[:-1]  # every feature column (all except the 'quality' target)
X = df[cols]
y = df['quality']
Xplus = X.copy()
# statsmodels does not add an intercept automatically, so add a constant column.
Xplus = sm.add_constant(Xplus)
regress = sm.OLS(y, Xplus)
results = regress.fit()
# print() call form runs on both Python 2 and Python 3 (the original used the
# Python-2-only print statement). The unused `from statsmodels.formula.api
# import ols` line was removed — `ols` was never referenced.
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: quality R-squared: 0.361
Model: OLS Adj. R-squared: 0.356
Method: Least Squares F-statistic: 81.35
Date: Sun, 12 Jun 2016 Prob (F-statistic): 1.79e-145
Time: 23:56:17 Log-Likelihood: -1569.1
No. Observations: 1599 AIC: 3162.
Df Residuals: 1587 BIC: 3227.
Df Model: 11
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------
const 21.9652 21.195 1.036 0.300 -19.607 63.538
fixedacid 0.0250 0.026 0.963 0.336 -0.026 0.076
volacid -1.0836 0.121 -8.948 0.000 -1.321 -0.846
citacid -0.1826 0.147 -1.240 0.215 -0.471 0.106
ressugar 0.0163 0.015 1.089 0.276 -0.013 0.046
chlorides -1.8742 0.419 -4.470 0.000 -2.697 -1.052
freeso2 0.0044 0.002 2.009 0.045 0.000 0.009
totals02 -0.0033 0.001 -4.480 0.000 -0.005 -0.002
density -17.8812 21.633 -0.827 0.409 -60.314 24.551
ph -0.4137 0.192 -2.159 0.031 -0.789 -0.038
sulphates 0.9163 0.114 8.014 0.000 0.692 1.141
alcohol 0.2762 0.026 10.429 0.000 0.224 0.328
==============================================================================
Omnibus: 27.376 Durbin-Watson: 1.757
Prob(Omnibus): 0.000 Jarque-Bera (JB): 40.965
Skew: -0.168 Prob(JB): 1.27e-09
Kurtosis: 3.708 Cond. No. 1.13e+05
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.13e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
from sklearn.linear_model import Ridge

# Baseline ridge fit on the raw, unstandardised features.
# NOTE(review): fit_intercept=False with unstandardised features is unusual —
# presumably intentional here for comparison with the OLS fit; confirm.
est = Ridge(alpha=.05, fit_intercept=False, normalize=False)
est.fit(X, y)
est.score(X, y)
0.36006522592329782
Both sm.OLS and ridge regression provide similar R-squared values.
# Pairwise scatter plots of every feature against every other feature.
sns.pairplot(X)
<seaborn.axisgrid.PairGrid at 0x11b9bef50>
# Create correlation heatmap
# Boolean heatmap: True where |correlation| > 0.6, highlighting collinear pairs.
sns.heatmap(np.abs(X.corr())>0.6)
<matplotlib.axes._subplots.AxesSubplot at 0x12df69490>
from sklearn import feature_selection as fs

def f_regression_feature_selection(feature_matrix, response):
    """Univariate F-test of each column of *feature_matrix* against *response*.

    Returns a pair (F, p): the F-statistics and p-values, one per feature
    (we care about the second array).
    """
    # The original parameter was named `input`, shadowing the builtin; renamed.
    return fs.univariate_selection.f_regression(feature_matrix, response)

[F, p] = f_regression_feature_selection(X, y)
ps = zip(X.columns, p)
# Rank features by ascending p-value (most significant first).
ranked_p = pd.DataFrame(sorted(ps, key=lambda x: x[1]), columns=['feature', 'p'])
ranked_p
| feature | p | |
|---|---|---|
| 0 | alcohol | 2.831477e-91 |
| 1 | volacid | 2.051715e-59 |
| 2 | sulphates | 1.802088e-24 |
| 3 | citacid | 4.991295e-20 |
| 4 | totals02 | 8.621703e-14 |
| 5 | density | 1.874957e-12 |
| 6 | chlorides | 2.313383e-07 |
| 7 | fixedacid | 6.495635e-07 |
| 8 | ph | 2.096278e-02 |
| 9 | freeso2 | 4.283398e-02 |
| 10 | ressugar | 5.832180e-01 |
# Handpicked columns: the eight features with the smallest univariate p-values.
col_hp = ranked_p['feature'].head(8).tolist()
col_hp
['alcohol', 'volacid', 'sulphates', 'citacid', 'totals02', 'density', 'chlorides', 'fixedacid']
# Greedy forward selection with a default Ridge model: in each round, add the
# candidate feature that gives the largest in-sample R^2.
features = []
for _ in range(len(X.columns) - 1):
    round_scores = []
    for feat in X.columns.difference(features):
        candidate = features + [feat]
        est = Ridge()
        est.fit(X[candidate], y)
        round_scores.append([feat, est.score(X[candidate], y)])
    dfz = pd.DataFrame(round_scores, columns=['a', 's'])
    dfz.index = dfz.a
    # idxmax() returns the feature *label*; in modern pandas argmax() returns a
    # positional integer, which would break the concatenation below.
    addfeat = dfz.s.idxmax()
    print('Added feature: ' + addfeat + ', new score is: ' + str(dfz.s.max()))
    features.append(addfeat)
Added feature: alcohol, new score is: 0.226734299343 Added feature: volacid, new score is: 0.316967026109 Added feature: sulphates, new score is: 0.33586639861 Added feature: totals02, new score is: 0.343751828747 Added feature: chlorides, new score is: 0.350879290299 Added feature: ph, new score is: 0.356332154507 Added feature: freeso2, new score is: 0.358615407846 Added feature: citacid, new score is: 0.35911556155 Added feature: ressugar, new score is: 0.359349520426 Added feature: fixedacid, new score is: 0.359479182064
# Keep the six best features from forward selection.
col_fwd = features[:6]
# Standardise to zero mean / unit variance (z-scores).
stand_X = (X - X.mean()) / X.std() # standardised features
from sklearn.feature_selection import RFE

# Rank every feature by recursive feature elimination on a ridge model,
# then keep the six best-ranked features.
est = Ridge(alpha = 100)
rfe = RFE(estimator=est, n_features_to_select=1, step=1)
rfe.fit(stand_X, y)
ranked = sorted(zip(stand_X.columns, rfe.ranking_), key=lambda pair: pair[1])
col_rfe = [name for name, _ in ranked][:6]
ranked
[('alcohol', 1),
('volacid', 2),
('sulphates', 3),
('chlorides', 4),
('totals02', 5),
('ph', 6),
('freeso2', 7),
('fixedacid', 8),
('density', 9),
('ressugar', 10),
('citacid', 11)]
# print() with explicit formatting runs on both Python 2 and Python 3
# (the original Python-2 print statements fail to parse on Python 3).
print('Handpicked columns: {}'.format(col_hp))
print('Columns selected via fwd selection: {}'.format(col_fwd))
print('Columns selected via RFE: {}'.format(col_rfe))
Handpicked columns: ['alcohol', 'volacid', 'sulphates', 'citacid', 'totals02', 'density', 'chlorides', 'fixedacid'] Columns selected via fwd selection: ['alcohol', 'volacid', 'sulphates', 'totals02', 'chlorides', 'ph'] Columns selected via RFE: ['alcohol', 'volacid', 'sulphates', 'chlorides', 'totals02', 'ph']
# Compare the three candidate feature sets with a default Ridge fit.
for feats in [col_hp, col_fwd, col_rfe]:
    est = Ridge()
    Xs = X[feats]
    # print() call form for Python 2/3 compatibility.
    print(est.fit(Xs, y).score(Xs, y))
0.354070785466 0.356332154507 0.356332154507
# OLS on the standardised forward-selected features (with intercept).
Xs = stand_X[col_fwd]
Xs = sm.add_constant(Xs) #Add the constant 1 column to Xs for statsmodel
regress = sm.OLS(y, Xs)
results = regress.fit()
# print() call form for Python 2/3 compatibility.
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: quality R-squared: 0.357
Model: OLS Adj. R-squared: 0.355
Method: Least Squares F-statistic: 147.4
Date: Sun, 12 Jun 2016 Prob (F-statistic): 7.12e-149
Time: 23:40:40 Log-Likelihood: -1573.4
No. Observations: 1599 AIC: 3161.
Df Residuals: 1592 BIC: 3198.
Df Model: 6
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------
const 5.6360 0.016 347.419 0.000 5.604 5.668
alcohol 0.3098 0.018 17.291 0.000 0.275 0.345
volacid -0.1859 0.018 -10.338 0.000 -0.221 -0.151
sulphates 0.1506 0.019 8.076 0.000 0.114 0.187
totals02 -0.0780 0.017 -4.684 0.000 -0.111 -0.045
chlorides -0.0942 0.019 -5.030 0.000 -0.131 -0.057
ph -0.0672 0.018 -3.750 0.000 -0.102 -0.032
==============================================================================
Omnibus: 24.314 Durbin-Watson: 1.749
Prob(Omnibus): 0.000 Jarque-Bera (JB): 34.645
Skew: -0.165 Prob(JB): 3.00e-08
Kurtosis: 3.642 Cond. No. 1.91
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Refit ridge on the forward-selected (raw) features and inspect residuals.
est = Ridge()
Xs = X[col_fwd]
est.fit(Xs, y)
yhat = est.predict(Xs)
err = pd.DataFrame(y)
err['err'] = yhat - y  # residual = prediction minus actual
err.iplot(kind='scatter', x = 'quality', y = 'err', mode='markers', size = 4, dimensions = (980,420), xTitle = 'Actual values',
          yTitle = 'Residuals', title = 'Analysis of residuals' )
# Residual distribution: histogram with a kernel-density overlay.
sns.distplot(err['err'], hist = True, kde = True )
<matplotlib.axes._subplots.AxesSubplot at 0x12e6552d0>
#Test to see if alphas have any impact on the fit or the scores
n_alphas = 200
alphas = np.logspace(-10, 2, n_alphas)
clf = Ridge(fit_intercept=True)
# One row per alpha: the fitted coefficients followed by alpha and the R^2.
rows = []
for a in alphas:
    clf.set_params(alpha=a).fit(Xs, y)
    rows.append(list(clf.coef_) + [a, clf.score(Xs, y)])
coefsdf = pd.DataFrame(rows, columns=list(Xs.columns) + ['alpha', 'score'])
coefsdf.iplot(kind = 'scatter', x = 'alpha', dimensions = (980,500), xTitle = 'Alpha', yTitle = 'Scores and coefficients',
              title = 'Impact of alpha on parameters and score', theme = 'solar')
Because visual inspection had indicated that 2nd degree terms might be useful, we add squared terms to the regression model and then standardise the features to reduce multi-collinearity
# Add squared terms for each forward-selected feature.
# .copy() avoids pandas' SettingWithCopyWarning (which the original triggered)
# when new columns are assigned to a slice of X.
Xs = X[col_fwd].copy()
# Iterate over the fixed list col_fwd rather than Xs.columns, so we do not
# mutate the column index while iterating over it.
for feat in col_fwd:
    Xs['sq_' + feat] = Xs[feat] ** 2
# Standardise the augmented features to reduce multicollinearity.
stand_Xs = (Xs - Xs.mean()) / Xs.std()
est = Ridge()
est.fit(stand_Xs, y)
score = est.score(stand_Xs, y)
# print() call form for Python 2/3 compatibility.
print('score = ' + str(score))
/Users/Sekhri/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
score = 0.3779701092
We find that there is a small improvement in score after adding squared terms. We look at statsmodels output to see which coefficients are significant
# Statsmodels fit of the squared-term model to see which coefficients are
# significant (add_constant supplies the intercept column).
stand_Xs = sm.add_constant(stand_Xs)
regress = sm.OLS(y, stand_Xs)
results = regress.fit()
# print() call form for Python 2/3 compatibility.
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: quality R-squared: 0.378
Model: OLS Adj. R-squared: 0.374
Method: Least Squares F-statistic: 80.39
Date: Sun, 12 Jun 2016 Prob (F-statistic): 4.72e-154
Time: 23:58:09 Log-Likelihood: -1546.7
No. Observations: 1599 AIC: 3119.
Df Residuals: 1586 BIC: 3189.
Df Model: 12
Covariance Type: nonrobust
================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
--------------------------------------------------------------------------------
const 5.6360 0.016 352.582 0.000 5.605 5.667
alcohol 0.3740 0.282 1.327 0.185 -0.179 0.927
volacid -0.0605 0.068 -0.892 0.372 -0.194 0.072
sulphates 0.5776 0.066 8.710 0.000 0.447 0.708
totals02 -0.1051 0.042 -2.488 0.013 -0.188 -0.022
chlorides -0.1107 0.051 -2.186 0.029 -0.210 -0.011
ph 0.6482 0.448 1.448 0.148 -0.230 1.526
sq_alcohol -0.0686 0.281 -0.244 0.807 -0.620 0.482
sq_volacid -0.1000 0.067 -1.503 0.133 -0.230 0.030
sq_sulphates -0.4387 0.067 -6.538 0.000 -0.570 -0.307
sq_totals02 0.0415 0.042 0.995 0.320 -0.040 0.123
sq_chlorides 0.0349 0.051 0.691 0.490 -0.064 0.134
sq_ph -0.7318 0.447 -1.636 0.102 -1.609 0.146
==============================================================================
Omnibus: 24.345 Durbin-Watson: 1.754
Prob(Omnibus): 0.000 Jarque-Bera (JB): 35.359
Skew: -0.158 Prob(JB): 2.10e-08
Kurtosis: 3.657 Cond. No. 73.1
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Several of the squared terms have large p-values (i.e. they are not significant) and appear to have unbalanced the regression. So we drop the insignificant squared terms one by one and try again.
# Drop the squared terms that were not significant (only sq_sulphates is kept).
dropcols = 'sq_ph sq_volacid sq_totals02 sq_chlorides sq_alcohol'.split()
stand_Xs.drop(dropcols, axis = 1, inplace=True)
regress = sm.OLS(y, stand_Xs)
results = regress.fit()
# print() call form for Python 2/3 compatibility.
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: quality R-squared: 0.376
Model: OLS Adj. R-squared: 0.373
Method: Least Squares F-statistic: 136.9
Date: Sun, 12 Jun 2016 Prob (F-statistic): 6.80e-158
Time: 23:58:17 Log-Likelihood: -1549.8
No. Observations: 1599 AIC: 3116.
Df Residuals: 1591 BIC: 3159.
Df Model: 7
Covariance Type: nonrobust
================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
--------------------------------------------------------------------------------
const 5.6360 0.016 352.463 0.000 5.605 5.667
alcohol 0.3016 0.018 17.045 0.000 0.267 0.336
volacid -0.1625 0.018 -9.008 0.000 -0.198 -0.127
sulphates 0.5753 0.064 8.953 0.000 0.449 0.701
totals02 -0.0670 0.016 -4.060 0.000 -0.099 -0.035
chlorides -0.0804 0.019 -4.328 0.000 -0.117 -0.044
ph -0.0841 0.018 -4.718 0.000 -0.119 -0.049
sq_sulphates -0.4424 0.064 -6.897 0.000 -0.568 -0.317
==============================================================================
Omnibus: 27.078 Durbin-Watson: 1.756
Prob(Omnibus): 0.000 Jarque-Bera (JB): 39.461
Skew: -0.176 Prob(JB): 2.70e-09
Kurtosis: 3.684 Cond. No. 8.73
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Residual diagnostics for the reduced squared-term model.
err = pd.DataFrame(y)
est.fit(stand_Xs, y)
err['err'] = est.predict(stand_Xs) - y
err.iplot(kind='scatter', x = 'quality', y = 'err', mode='markers', size = 4)
# Histogram plus kernel-density estimate of the residuals.
sns.distplot(err['err'], hist = True, kde = True )
<matplotlib.axes._subplots.AxesSubplot at 0x12e71a190>
We find that the presence of the remaining squared term doesn't improve the quality of the residuals noticeably. However, given its significant p-value, we shall keep it in our final model.
# Repeated random train/test splits to see the distribution of the
# out-of-sample R^2 of the forward-selected ridge model.
Xs = X[col_fwd]
scores = []
ynew = y.copy()
for state in range(0, 10000, 10):
    # list() is required on Python 3, where range() returns an immutable
    # object that np.random.shuffle cannot shuffle in place.
    irange = list(range(len(X)))
    rng = np.random.RandomState(state)  # seeded per split for reproducibility
    rng.shuffle(irange)
    Xs = Xs.reindex(irange)
    ynew = ynew.reindex(irange)
    # Fixed 750 / 849 train/test split of the 1599 shuffled rows.
    Xtrain = Xs[:750]
    ytrain = ynew[:750]
    Xtest = Xs[750:]
    ytest = ynew[750:]
    est = Ridge()
    est.fit(Xtrain, ytrain)
    scores.append(est.score(Xtest, ytest))
# (The unused `import random` was removed — only np.random is used.)
sns.distplot(scores)
<matplotlib.axes._subplots.AxesSubplot at 0x130c629d0>
# Confirm y still carries its original RangeIndex (only the ynew copy above
# was reindexed).
y.index
RangeIndex(start=0, stop=1599, step=1)
The final model obtained provided the following results
#creating 2 bins (0,5] being bad score and (5,10] being good score
bins = [0, 5, 10]
ybin = pd.cut(y, bins, labels=[0, 1])
stand_X = (X - X.mean()) / X.std()
data = pd.concat([stand_X, ybin], axis=1)
# Mean of each standardised feature within the two quality classes.
dfg = data.groupby('quality').mean().T
dfg.columns = ['bad', 'good']
# Difference of class means per feature, and its magnitude for ranking.
dfg['diff'] = dfg['good'] - dfg['bad']
dfg['absdiff'] = dfg['diff'].abs()
dfg.sort_values('absdiff')
| bad | good | diff | absdiff | |
|---|---|---|---|---|
| ressugar | 0.002315 | -0.002015 | -0.004330 | 0.004330 |
| ph | 0.003498 | -0.003044 | -0.006542 | 0.006542 |
| freeso2 | 0.066183 | -0.057591 | -0.123773 | 0.123773 |
| fixedacid | -0.101909 | 0.088679 | 0.190587 | 0.190587 |
| chlorides | 0.117341 | -0.102108 | -0.219449 | 0.219449 |
| density | 0.170513 | -0.148376 | -0.318890 | 0.318890 |
| citacid | -0.170534 | 0.148395 | 0.318929 | 0.318929 |
| sulphates | -0.233701 | 0.203361 | 0.437061 | 0.437061 |
| totals02 | 0.248588 | -0.216315 | -0.464902 | 0.464902 |
| volacid | 0.344478 | -0.299757 | -0.644235 | 0.644235 |
| alcohol | -0.465909 | 0.405423 | 0.871332 | 0.871332 |
from statsmodels.discrete.discrete_model import Logit
from sklearn.linear_model import LogisticRegression as logistic

# Greedy forward selection for the binary target using logistic regression
# (same procedure as the ridge forward selection above).
features = []
for _ in range(len(stand_X.columns) - 1):
    round_scores = []
    for feat in stand_X.columns.difference(features):
        candidate = features + [feat]
        est = logistic()
        est.fit(stand_X[candidate], ybin)
        round_scores.append([feat, est.score(stand_X[candidate], ybin)])
    dfz = pd.DataFrame(round_scores, columns=['a', 's'])
    dfz.index = dfz.a
    # idxmax() returns the feature *label*; modern pandas argmax() is
    # positional and would break the string concatenation below.
    addfeat = dfz.s.idxmax()
    print('Added feature: ' + addfeat + ', new score is: ' + str(dfz.s.max()))
    features.append(addfeat)
fwdlogitcols = features[:5]
Added feature: alcohol, new score is: 0.703564727955 Added feature: volacid, new score is: 0.737961225766 Added feature: ph, new score is: 0.741088180113 Added feature: chlorides, new score is: 0.742338961851 Added feature: ressugar, new score is: 0.741713570982 Added feature: density, new score is: 0.737961225766 Added feature: totals02, new score is: 0.739212007505 Added feature: freeso2, new score is: 0.744215134459 Added feature: sulphates, new score is: 0.747342088806 Added feature: fixedacid, new score is: 0.747967479675
# Rank each feature with RFE on a logistic-regression model and keep the
# six best-ranked features.
rfeest = logistic()
rfe = RFE(estimator=rfeest, n_features_to_select=1, step=1)
rfe.fit(stand_X, ybin)
ranked = sorted(zip(stand_X.columns, rfe.ranking_), key=lambda pair: pair[1])
rfelogitcols = [name for name, _ in ranked][:6]
rfelogitcols
['alcohol', 'volacid', 'totals02', 'sulphates', 'citacid', 'fixedacid']
# print() with explicit formatting runs on both Python 2 and Python 3.
print("Linear Regression features: " + str(['alcohol', 'volacid', 'sulphates', 'chlorides', 'totals02', 'ph']))
print("Logistic Regression features: {}".format(rfelogitcols))
Linear Regression features: ['alcohol', 'volacid', 'sulphates', 'chlorides', 'totals02', 'ph'] Logistic Regression features: ['alcohol', 'volacid', 'totals02', 'sulphates', 'citacid', 'fixedacid']
# Checking which feature selection is better
# NOTE(review): 'logitcols' is defined in a *later* cell — this cell relies on
# out-of-order notebook execution; confirm before running top-to-bottom.
for feat in [logitcols, rfelogitcols]:
    logist = logistic()
    logist.fit(stand_X[feat], ybin)
    # Score on the same columns the model was fit on. The original scored
    # every model on stand_X[rfelogitcols], which made the first printed
    # score (0.459) meaningless.
    print('score: ' + str(logist.score(stand_X[feat], ybin)))
score: 0.459036898061 score: 0.746716697936
# Logit on a hand-chosen feature set.
logitcols = [ u'volacid', u'chlorides', u'freeso2', u'totals02', u'sulphates', u'alcohol']
# NOTE(review): no constant is added, so this logit is fit without an intercept.
logit = sm.Logit(ybin, stand_X[logitcols])
result = logit.fit()
# print() call form for Python 2/3 compatibility.
print(result.summary())
Optimization terminated successfully.
Current function value: 0.525288
Iterations 6
Logit Regression Results
==============================================================================
Dep. Variable: quality No. Observations: 1599
Model: Logit Df Residuals: 1593
Method: MLE Df Model: 5
Date: Mon, 13 Jun 2016 Pseudo R-squ.: 0.2395
Time: 00:06:25 Log-Likelihood: -839.94
converged: True LL-Null: -1104.5
LLR p-value: 4.167e-112
==============================================================================
coef std err z P>|z| [95.0% Conf. Int.]
------------------------------------------------------------------------------
volacid -0.5124 0.066 -7.724 0.000 -0.642 -0.382
chlorides -0.2086 0.068 -3.068 0.002 -0.342 -0.075
freeso2 0.2667 0.084 3.182 0.001 0.102 0.431
totals02 -0.6104 0.092 -6.645 0.000 -0.790 -0.430
sulphates 0.4537 0.071 6.428 0.000 0.315 0.592
alcohol 0.8664 0.072 12.114 0.000 0.726 1.007
==============================================================================
# Logit on the RFE-selected features (again without an intercept column).
stand_Xs = stand_X.copy()
logit = sm.Logit(ybin, stand_Xs[rfelogitcols])
result = logit.fit()
# print() call form for Python 2/3 compatibility.
print(result.summary())
Optimization terminated successfully.
Current function value: 0.527167
Iterations 6
Logit Regression Results
==============================================================================
Dep. Variable: quality No. Observations: 1599
Model: Logit Df Residuals: 1593
Method: MLE Df Model: 5
Date: Mon, 13 Jun 2016 Pseudo R-squ.: 0.2368
Time: 00:06:31 Log-Likelihood: -842.94
converged: True LL-Null: -1104.5
LLR p-value: 8.265e-111
==============================================================================
coef std err z P>|z| [95.0% Conf. Int.]
------------------------------------------------------------------------------
alcohol 0.9849 0.072 13.656 0.000 0.844 1.126
volacid -0.7008 0.084 -8.390 0.000 -0.865 -0.537
totals02 -0.3502 0.067 -5.235 0.000 -0.481 -0.219
sulphates 0.3868 0.064 6.091 0.000 0.262 0.511
citacid -0.3846 0.103 -3.727 0.000 -0.587 -0.182
fixedacid 0.2652 0.086 3.083 0.002 0.097 0.434
==============================================================================
All p-values are significant. The overall accuracy of the logistic model is about 0.74, while the pseudo-R-squared is 0.2368. The p-value for the overall regression is very low, indicating a low likelihood of a spurious relationship. Overall, the RFE features provided a better result, so we stick with that set.
We find that the feature selection is quite similar between both linear and logistic regressions. We find that the signs of the relationships between the variables are all preserved.
From the logistic regression, we observe that alcohol content, sulphates and fixed acidity are higher in good wines, while at the same time volatile acidity, total sulfur dioxide and citric acid content are lower in the good wines.
# Closing image, fetched from an external host.
Image(url='http://i.imgur.com/z1W5auA.gif')